## Ryan Copus and Ryan Hübert
## Measuring How Much Judges Matter for Case Outcomes
## Journal of Law and Courts

# Note to reader: please review the README for important information about 
# software and other requirements

################################################################################
# SCRIPT-SPECIFIC USER-SPECIFIED VARIABLES
################################################################################

# Do you want to save the plots and tables?
# If so, please set `to.save <- TRUE` below
# Note: the exists() test is checking whether this script is being run passively 
#       via the 0.FullReplication.R script, which sets its own toggle for to.save
if(!exists("to.save")) {
  # Clear workspace (comment out if you do not want to do this for some reason)
  rm(list=ls())
  # Do you want to save the plots and tables?
  to.save <- TRUE
} else {
  # If this script is being executed passively via the 0.FullReplication.R script
  message(paste0("This script will", ifelse(to.save,""," NOT"), 
                 " save plots and tables."))
  Sys.sleep(5)
}

################################################################################
# LOAD UTILITIES, PACKAGES AND DATA
################################################################################

# Load utils and user-supplied variables
source(here::here("Code/_config.R"))
source(here::here("Code/_utils.R"))

# Load required libraries
library("tidyverse") # for data manipulation and plotting
library("pROC") # for ROC curves
library("wfe") # for regressions

# Keep any object we have loaded above
keep.obj <- c(ls(), "keep.obj")

################################################################################
# TABLES C1 and C2 IN THE APPENDIX
################################################################################

# Load dataset and randomization predictions
load("Data/9C_DATA.rdata")
load("Data/PREDICTORS.rdata")

# Replace "MISSING" with NA
df[] <- lapply(df, function(x) ifelse(grepl("MISSING", as.character(x)), NA, as.character(x)))

# Ensure factor variable types are correct
fac.vars <- c('APPTYPE', 'JURIS', 'NOS', 'DDIST', 'DOFFICE', 'JOINAPP', 'd.SECTION', 
              'd.SUBSECT', 'd.COUNTY', 'd.TRCLACT', 'd.PROCPROG', 'd.DISP', 'd.NOJ', 
              'd.JUDGMENT', 'd.PROSE', 'd.IFP', 'd.PLRES', 'd.DEFRES', 'd.JURY', 'd.jud')
df[fac.vars] <- lapply(df[fac.vars], as.factor)

# Ensure numeric variable types are correct
num.vars <- c('CITY.APPELLEE', 'CITY.APPELLANT', 'BANK.APPELLEE', 'BANK.APPELLANT', 
              'STATE.APPELLEE', 'STATE.APPELLANT', 'APLEPROSE', 'MULTIPLE.APPELLANT', 
              'MULTIPLE.APPELLEE', 'USAPT', 'USAPE', 
              'CIVILRIGHTS', 'PI', 'SS', 'TORT', 'FORF', 'LABOR', 'CONTRACT', 'RP', 
              'd.pwon', 'd.TRANSFERRED', 'd.CLASSACT', 'd.ARBIT', 'd.republican', 
              'd.woman', 'd.black', 'd.nw_nb', 'd.exp_prosecutor', 'd.exp_defender', 
              'd.exp_jud_state', 'd.exp_jud_mag', 'd.exp_jud_oth_fed', 'd.exp_civr', 
              'd.exp_mil', 'd.PLT.APPELLAN', 'd.DEF.APPELLAN', 
              'p.rep', 'p.dem', 'p.rep.div', 'p.dem.div', 'p.maj.rep', 'p.maj.r.div', 
              'p.maj.d.div', 'p.rep.div.any', 'p.dem.div.any', 'p.seniors', 'p.women', 
              'p.black', 'p.hispanic', 'p.asian', 'p.pacific', 'p.non.white', 'p.rep.black', 
              'p.rep.hispanic', 'p.rep.non.white', 'p.rep.other.non.white', 
              'p.rep.white.male', 'p.rep.white.female', 'p.dem.black', 
              'p.dem.hispanic', 'p.dem.non.white', 'p.dem.other.non.white', 
              'p.dem.white.male', 'p.dem.white.female', 'p.Bush41', 'p.Bush43', 
              'p.Carter', 'p.Eisenhower', 'p.Clinton', 'p.Ford', 'p.Johnson', 
              'p.Kennedy', 'p.Nixon', 'p.Obama', 'p.Reagan', 'p.EWQ', 'p.WQ', 
              'p.Q', 'p.NQ', 'p.noneQ', 'p.dime.median', 'p.dime.mean', 'p.dime.min', 
              'p.dime.max')
df[num.vars] <- lapply(df[num.vars], as.numeric)

# Only need predictors
df <- df[,c(app.predictors, dist.predictors, panel.predictors)]

# This function creates a nicely formatted LaTeX table
create_latex_table <- function(datafr, digits = 3, caption = "", label = "", panel = FALSE) {
  # Format numeric columns
  df_fmt <- datafr %>%
    mutate(across(where(is.numeric), ~ sprintf(paste0("%.",digits,"f"), .x))) %>%
    replace(is.na(.) | .=="NA", "--")  # Replace NA with empty string
  
  # Build LaTeX lines
  header <- paste(colnames(df_fmt), collapse = " & ")
  rows <- apply(df_fmt, 1, function(row) paste(row, collapse = " & "))
  
  # Combine everything into tabular
  tabular_body <- paste(c(header, "\\hline", rows), collapse = " \\\\\n")
  
  latex_table <- paste0("\\begin{table}\\tiny\n\\caption{", 
                        caption,
                        "}\\label{",
                        label, 
                        "}\\begin{tabular}{", paste(rep("l", ncol(df_fmt)), collapse = ""), "}\n",
                        tabular_body,
                        ifelse(panel, "", ""), 
                        " \\\\\n\\end{tabular}\n\\end{table}")
  latex_table <- str_replace_all(latex_table, "([_%#])", "\\\\\\1") 
  return(latex_table)
}

# Create a dataframe with summary statistics 
qf <- df %>%
  select(all_of(num.vars)) %>% 
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  summarize(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE), .groups = "drop") %>%
  mutate(mode = NA, mode.perc = NA) %>% 
  bind_rows(df %>%
              select(all_of(fac.vars)) %>% 
              pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
              group_by(variable, value) %>%
              summarize(n = n()) %>%
              ungroup() %>%
              group_by(variable) %>%
              mutate(mode.perc = n/sum(n)) %>%
              arrange(variable, desc(n)) %>%
              filter(row_number() == 1) %>%
              ungroup() %>%
              mutate(mean = NA, sd = NA, mode = as.character(value)) %>% 
              select(variable, mean, sd, mode, mode.perc)) %>%
  full_join(mutate(read_csv("Data/predictor_info.csv", col_types = cols(.default = col_character())), order = row_number()),
            by = "variable") %>%
  mutate(description = case_when(orig_deriv == "Derivative" ~ paste0(as.character(description), "$^*$"), TRUE ~ as.character(description))) %>%
  arrange(order)

# Nice column names
nice.cols <- c("Variable Description", "Variable Name", "Mean", "Std. Dev.",
               "Modal Value", "Modal Value %", "Variable Source")

# Create a dataframe with case characteristic summary statistics 
caption <- paste0(
  "List of case characteristics we use as predictors in our models, ",
  "including information about their source. Variables marked with a star ",
  "were created by the authors using data in the source dataset, whereas ",
  "variables without a star are original variables from the source dataset ",
  "(with minimal cleaning)."
)
tp <- qf %>%
  filter(type == "case") %>%
  select(description, variable, mean, sd, mode, mode.perc, source) %>%
  set_names(nice.cols) %>% 
  create_latex_table(caption = caption, label = "tab:CasePredictors") %>% 
  write_file(file = 'Outputs/tableC1.tex')

# Create a dataframe with panel characteristic summary statistics 
tp <- qf %>%
  filter(type == "panel") %>%
  select(description, variable, mean, sd, mode, mode.perc, source) %>%
  set_names(nice.cols) %>% 
  create_latex_table(caption = gsub(" case characteristics ", " panel characteristics ", caption), 
                     label = "tab:PanelPredictors") %>% 
  write_file(file = 'Outputs/tableC2.tex')

################################################################################
# FIGURE 3: TESTING RANDOMIZATION
################################################################################

# Load dataset and randomization predictions
load("Data/9C_DATA.rdata")
load("Data/randomization.rdata")

tf <- left_join(pf, df[,c("row.id", "p.maj.rep", "FE")]) 
rm(pf,df)

roc1 <- roc(tf$p.maj.rep, tf$rando.base)
roc2 <- roc(tf$p.maj.rep, tf$rando.full)
rf <- bind_rows(tibble(tpr = roc1$sensitivities, fpr = 1-roc1$specificities, Model = "Fixed Effects Only"),
                tibble(tpr = roc2$sensitivities, fpr = 1-roc2$specificities, Model = "Fixed Effects Plus Predictors"))

z1 <- ggplot(rf, aes(y = tpr, x = fpr, group = Model, color = Model)) + 
  geom_abline(slope = 1, intercept = 0, linewidth = 0.3, linetype = "dashed") + 
  geom_line() +
  labs(y = "True Positive Rate", x = "False Positive Rate") + 
  scale_x_continuous(expand = c(0.01,0.01)) + 
  scale_y_continuous(expand = c(0.01,0.01)) + 
  scale_color_manual(values = c("grey60", "black")) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        axis.title = element_text(size=12),
        legend.text = element_text(size=10),
        legend.title = element_text(size=12),
        strip.background = element_blank()) 

if(to.save){
  pdf("Outputs/fg3.pdf", width=4.75, height=2.5, family = "Times")
  print(z1)
  dev.off()
}

# An additional test of randomization (see section 2.4 of the paper)
load("Data/deviance.rdata")
tf <- left_join(tf, pf[,c("row.id", "expected")]) 
load("Data/perc.rdata")
tf <- left_join(tf, pf[,c("row.id", "panel.perc")]) 

mod <- lm(panel.perc ~ expected + FE, tf)
print(summary(mod)$coefficients[2,])

rm(list = setdiff(ls(), keep.obj))

################################################################################
# FIGURE 4: FACE VALIDITY
################################################################################

# Note to reader: the following figure cannot be reproduced as the judge 
# identifiers have been redacted. See the README.

# Load dataset and results
load("Data/9C_DATA.rdata")
load("Data/perc.rdata")

juds <- c("JXXX", "JXXX") ## All judge identifiers have been anonymized/redacted

tf <- left_join(df, pf) %>% 
  select(row.id, panel.base, panel.perc) %>%
  mutate(Panels = NA) %>% 
  mutate(Panels = if_else(str_detect(panel.base, juds[1]), "Panels that Include Judge Kozinski", Panels)) %>% 
  mutate(Panels = if_else(str_detect(panel.base, juds[2]), "Panels that Include Judge Reinhardt", Panels)) %>% 
  filter(!is.na(Panels))

z1 <- ggplot(tf) + 
  geom_density(aes(x = panel.perc, group = Panels, fill = Panels), alpha = 0.3) + 
  labs(x = "Panel Reversal Quantiles (PRQs)", y = "Density") +
  scale_y_continuous(limits = c(0,7), breaks=seq(0,7,1)) +
  scale_fill_manual(values = c("grey10", "gray70")) + 
  theme_bw() + 
  theme(title = element_text(size=12),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.text = element_text(size=10),
        legend.title = element_text(size=12),
        axis.text = element_text(size=10),
        axis.title = element_text(size=12),
        strip.background = element_blank())

if(to.save){
  pdf("Outputs/fg4.pdf", width=6.5, height=2, family = "Times")
  print(z1)
  dev.off()
}

rm(list = setdiff(ls(), keep.obj))

################################################################################
# FIGURE 5: COMPARING PRQs AND DIME SCORES
################################################################################

# Load dataset and results
load("Data/9C_DATA.rdata")
load("Data/perc.rdata")

tf <- left_join(df, pf) %>%
  select(panel.perc, negative, p.dime.median) %>% 
  mutate(dime.perc = 1-ecdf(p.dime.median)(p.dime.median))

# Create dataset for the plot
z <- seq(0, 0.95, 0.05)
res2 <- c()
res3 <- c()
for(i in z){
  mask1 <- (i < tf$panel.perc & tf$panel.perc <= (i+0.05))
  res2 <- c(res2, mean(tf$negative[mask1]))
  mask2 <- (i < tf$dime.perc & tf$dime.perc <= (i+0.05))
  res3 <- c(res3, mean(tf$negative[mask2]))
}
res3 <- bind_rows(bind_cols(x = z+0.05, y = res2, `Treatment Variable` = "PRQ"),
                  bind_cols(x = z+0.05, y = res3, `Treatment Variable` = "DIME"))

cors <- res3 %>%
  mutate(maxy = max(y) + if_else(`Treatment Variable` == "DIME", -0.005, 0.025)) %>%
  group_by(`Treatment Variable`) %>%
  summarize(cor = cor(x, y), y = max(maxy))

z1 <- ggplot(res3, aes(x = x, y = y, group = `Treatment Variable`, shape = `Treatment Variable`, color = `Treatment Variable`)) + 
  geom_point(size = 2) + 
  stat_smooth(method = "lm", formula = "y ~ x", se = FALSE, linewidth=1) +
  geom_text(data = cors, aes(x = 0.3, y = y, label = paste0("Correlation for ",`Treatment Variable`,": ",sprintf(paste0("%.2f"), round(cor, 2)))), size = 8 / .pt) +
  labs(x = "Treatment Variable Decile", y = "Reversal Rate") +
  geom_hline(yintercept = mean(df$negative), linetype = "dashed") +
  scale_x_continuous(breaks=seq(0,1,0.1), labels=10*seq(0,1,0.1)) +
  scale_y_continuous(limits = c(0.20,0.50), breaks=seq(0.20,0.50,0.05)) +
  scale_color_manual(values = c("grey60", "black")) +
  theme_bw() + 
  theme(title = element_text(size=12),
        plot.title = element_text(hjust = 0.5),
        legend.text = element_text(size=10),
        legend.title = element_text(size=12),
        panel.grid.minor = element_blank(),
        axis.text = element_text(size=10),
        axis.title = element_text(size=12),
        strip.background = element_blank())

if(to.save){
  pdf("Outputs/fg5.pdf", width=6.5, height=2, family = "Times")
  print(z1)
  dev.off()
}

rm(list = setdiff(ls(), keep.obj))

################################################################################
# FIGURE 6: COMPARING QUANTILES
################################################################################

# Load dataset and results
load("Data/9C_DATA.rdata")
load("Data/perc.rdata")

tf <- left_join(df, pf) %>% 
  select(panel.perc, negative, FE)

# Create data for plot
tf$quantile <- 1
tf$quantile[tf$panel.perc > .2 & tf$panel.perc <= .4] <- 2
tf$quantile[tf$panel.perc > .4 & tf$panel.perc <= .6] <- 3
tf$quantile[tf$panel.perc > .6 & tf$panel.perc <= .8] <- 4
tf$quantile[tf$panel.perc > .8] <- 5
ab <- NULL
for(i in 2:5){
  compare <- c(1, i)
  test <- tf[tf$quantile %in% compare, ]
  test$treat <- ifelse(test$quantile == max(test$quantile), 1, 0)
  test$FE <- factor(test$FE)
  reg <- wfe(negative ~ treat, test, treat = "treat", unit.index = "FE", qoi = "ate")
  a <- c(reg$coefficients[[1]], reg[7], i)
  ab <- rbind(ab, a)
}

ab <- as.data.frame(ab)
ab$V1 <- unlist(ab$V1)
ab$se <- unlist(ab$se)
ab$V3 <-unlist(ab$V3)

names(ab) <- c("effect", "SE", "Quantile")

z1 <- ggplot(ab, aes(x = Quantile, y = effect, group = Quantile)) + 
  geom_point(size = 3, position = position_dodge2(width = 0.2, preserve = "single")) + 
  geom_errorbar(aes(ymin = effect - 1.96*SE, ymax = effect + 1.96*SE), width = 0.2, position = "dodge") +
  geom_text(aes(y = effect + 1.96*SE + 0.022, label = sprintf(paste0("%.3f"), round(effect, 3))), size = 8 / .pt) +
  geom_text(aes(y = effect + 1.96*SE + 0.010, label = paste0("(",sprintf(paste0("%.3f"), round(SE, 3)),")")), size = 8 / .pt) +
  labs(x = "Panel Quintile", y = "Effect on Reversal Rate\nRelative to Lowest Quintile", color = "Circuit") +
  scale_x_continuous(breaks = c(2, 3, 4, 5)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme_bw() + 
  theme(axis.title = element_text(size=12),
        axis.text = element_text(size=10),
        strip.background = element_blank())

if(to.save){
  pdf("Outputs/fg6.pdf", width=6, height=2.5, family = "Times")
  print(z1)
  dev.off()
}

rm(list = setdiff(ls(), c(keep.obj, "tf")))

################################################################################
# FIGURE 7: COMPARING EXTREME PRQs
################################################################################

# Create data for plot
margins <- c(.01, .02, .03, .04, .05, .1)
ab <- NULL
for(margin in margins){
  # margin <- 0.1
  test <- tf[tf$panel.perc >= 1-margin | tf$panel.perc <= 0+margin, ]
  test$treat <- ifelse(test$panel.perc >= 1-margin, 1, 0)
  test$FE <- factor(test$FE)
  reg <- wfe(negative ~ treat, test, treat = "treat", unit.index = "FE", qoi = "ate")
  a <- c(reg$coefficients[[1]], reg[7], margin) # Coefficient, SE, margin
  ab <- rbind(ab, a)
}

ab <- as.data.frame(ab)
ab$V1 <- unlist(ab$V1)
ab$se <- unlist(ab$se)
ab$V3 <-unlist(ab$V3)

names(ab) <- c("effect", "SE", "Margin")
ab$Margin.C <- c(6, 5, 4, 3, 2, 1)
ab$Margin.C <- factor(ab$Margin.C)

z1 <- ggplot(ab, aes(x = Margin.C, y = effect, group = Margin.C)) + 
  geom_point(size = 3, position = position_dodge2(width = 0.2, preserve = "single")) +
  geom_errorbar(aes(ymin = effect - 1.96*SE, ymax = effect + 1.96*SE), width = 0.2, position = position_dodge2(width = 0.2, preserve = "single")) +
  geom_text(aes(y = effect + 1.96*SE + 0.055, label = sprintf(paste0("%.3f"), round(effect, 3))), size = 8 / .pt) +
  geom_text(aes(y = effect + 1.96*SE + 0.022, label = paste0("(",sprintf(paste0("%.3f"), round(SE, 3)),")")), size = 8 / .pt) +
  labs(x = "Assignment to Panel in Top v. Bottom X Quantile", y = "Effect on Reversal Rate", color = "Circuit") +
  scale_x_discrete(labels = c("10%", "5%", "4%", "3%", "2%", "1%")) +
  scale_y_continuous(breaks = seq(0,0.6,0.1), labels = c("0",str_pad(seq(0.1,0.6,0.1), width = 4, "right", "0")) ) + 
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme_bw() + 
  theme(axis.title = element_text(size=12),
        axis.text = element_text(size=10),
        strip.background = element_blank())

if(to.save){
  pdf("Outputs/fg7.pdf", width=6, height=2.5, family = "Times")
  print(z1)
  dev.off()
}

rm(list = setdiff(ls(), c(keep.obj, "tf")))

################################################################################
# FIGURE 8: CASE OUTCOMES AFTER RE-RANDOMIZATION
################################################################################

CalcAveInc <- function(sim){
  tf$group <- cut(tf$panel.perc, quantile(tf$panel.perc, probs = seq(0, 1, 1/sim)), include.lowest=T)
  coeffs <- lm(negative ~ group + FE, tf)$coefficients[2:sim]
  coeffs <- c(0, coeffs) # adding 0 for the reference category
  incs <- combn(coeffs,2)
  incs <- apply(incs, 2, diff) # marginal effect of reassignment to different group
  incs <- c(incs, rep(0, length(coeffs))) # allow for possibility of re-randomization to same group
  incs <- mean(incs)
  return(c(sim,incs))
}

print("Now estimating re-randomization---this will take time...")
sf <- parallel::mclapply(2:150, function(x) CalcAveInc(x))
sf <- data.frame(do.call(rbind, sf)) %>% set_names(c("s", "inc"))
print("... finished!")

z1 <- ggplot(sf, aes(x = s, y = inc)) + 
  geom_point(size = 1, color = "gray50") + 
  labs(x = "Number of Quantile Partitions", 
       y = "Minimum Percent of Case Outcomes\nChanged after Re-randomization") + 
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_hline(yintercept = 0.065, linetype = "dashed", color = "black") +
  annotate(geom = "text", x = 2, y = 0.068, 
           label = 0.065, color="black", hjust = 0.5, size = 10 / .pt) +
  scale_x_continuous(breaks = seq(0,150,10), labels = seq(0,150,10), limits=c(0,150)) +
  theme_bw() + 
  scale_y_continuous(limits = c(0.00,0.08), breaks = seq(0.0,0.08,0.01)) +
  theme(axis.title = element_text(size=12),
        panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        axis.text = element_text(size=10),
        legend.position = "none",
        strip.background = element_blank())

if(to.save){
  pdf("Outputs/fg8.pdf", width=6, height=3, family = "Times")
  print(z1)
  dev.off()
}

rm(list = setdiff(ls(), keep.obj))